In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
# Load data
train_data = pd.read_csv('/content/drive/MyDrive/colab_notebooks/Data/main_dataset.csv')
train_data['Id'] = np.where(train_data['Id'] < 1e-18, 1e-18, train_data['Id'])
train_data['Log_Id'] = np.log10(train_data['Id'])
X = train_data[['Tox', 'Nc', 'Nd', 'Ns', 'Vds', 'Vgs']]
y = train_data['Log_Id']
# Polynomial features and scaling
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
# Define hyperparameter grid for DecisionTreeRegressor
param_grid = {
'max_depth': [3, 5, 10, 15,None],
'min_samples_split': [2, 5, 10, 50, 100],
'min_samples_leaf': [1, 2, 4, 10],
'max_features': [None, 'sqrt', 'log2'],
}
# DataFrame to store results
results_df = pd.DataFrame(columns=['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features', 'R2', 'MAE'])
random_search = RandomizedSearchCV(
DecisionTreeRegressor(random_state=42),
param_grid,
n_iter=40,
random_state=42,
n_jobs=-1,
verbose = 5,
cv = 3,
)
random_search.fit(X_train, y_train)
Fitting 3 folds for each of 40 candidates, totalling 120 fits
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-7-d7f75b402af4> in <cell line: 49>() 91 plt.show() 92 ---> 93 current_result = pd.DataFrame({ 94 'max_depth': params['max_depth'], 95 'min_samples_split': params['min_samples_split'], /usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy) 776 elif isinstance(data, dict): 777 # GH#38939 de facto copy defaults to False only in non-dict cases --> 778 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) 779 elif isinstance(data, ma.MaskedArray): 780 from numpy.ma import mrecords /usr/local/lib/python3.10/dist-packages/pandas/core/internals/construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy) 501 arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays] 502 --> 503 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy) 504 505 /usr/local/lib/python3.10/dist-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate) 112 # figure out the index, if necessary 113 if index is None: --> 114 index = _extract_index(arrays) 115 else: 116 index = ensure_index(index) /usr/local/lib/python3.10/dist-packages/pandas/core/internals/construction.py in _extract_index(data) 665 666 if not indexes and not raw_lengths: --> 667 raise ValueError("If using all scalar values, you must pass an index") 668 669 if have_series: ValueError: If using all scalar values, you must pass an index
In [15]:
# Loop through each hyperparameter set
for params in random_search.cv_results_['params']:
# Set up the model with the current parameters
model = DecisionTreeRegressor(random_state=42, **params)
model.fit(X_train, y_train)
print(params)
# Test on the first test dataset
y_pred_test = model.predict(X_test)
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
# Load and preprocess the second test data
test_data = pd.read_csv('/content/drive/MyDrive/colab_notebooks/Data/test_data.csv').iloc[0:203]
test_data['Id'] = np.where(test_data['Id'] < 1e-18, 1e-18, test_data['Id'])
test_data['Log_Id'] = np.log10(test_data['Id'])
X_test_1 = test_data[['Tox', 'Nc', 'Nd', 'Ns', 'Vds', 'Vgs']]
y_test_1 = test_data['Log_Id']
X_test_1_transformed = poly.transform(X_test_1)
X_test_1_scaled = scaler.transform(X_test_1_transformed)
# Predict on the second test dataset and evaluate
y_pred_test_1 = model.predict(X_test_1_scaled)
r2_test_1 = r2_score(y_test_1, y_pred_test_1)
mae_test_1 = mean_absolute_error(y_test_1, y_pred_test_1)
# Log scale plot
plt.figure(figsize=(10, 5))
plt.plot(X_test_1['Vgs'], y_pred_test_1, color="green", label="Predicted")
plt.plot(X_test_1['Vgs'], y_test_1, color="blue", label="Actual")
plt.title(f'Vgs vs Id (Log scale) - Params: {params}')
plt.xlabel('Vgs')
plt.ylabel('Log10(Id)')
plt.legend()
plt.show()
# Linear scale plot
plt.figure(figsize=(10, 5))
plt.plot(X_test_1['Vgs'], np.maximum(np.power(10, y_pred_test_1), 1e-18), color="green", label="Predicted")
plt.plot(X_test_1['Vgs'], np.maximum(np.power(10, y_test_1), 1e-18), color="blue", label="Actual")
plt.title(f'Vgs vs Id (Linear scale) - Params: {params}')
plt.xlabel('Vgs')
plt.ylabel('Id')
plt.legend()
plt.show()
current_result = pd.DataFrame({
'max_depth': params['max_depth'],
'min_samples_split': params['min_samples_split'],
'min_samples_leaf': params['min_samples_leaf'],
'max_features': params['max_features'],
'R2': r2_test_1,
'MAE': mae_test_1
}, index = [0])
# Save to DataFrame
results_df = pd.concat([results_df, current_result], ignore_index=True)
print("-----------\n--------\n----------\n----------")
# Display results
print(results_df)
{'min_samples_split': 50, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'log2', 'max_depth': 10}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'log2', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'log2', 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}
-----------
--------
----------
----------
{'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 10}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 5}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': None}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': None}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'log2', 'max_depth': None}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 10}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 10, 'max_features': 'log2', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 15}
-----------
--------
----------
----------
{'min_samples_split': 100, 'min_samples_leaf': 10, 'max_features': 'log2', 'max_depth': 5}
-----------
--------
----------
----------
{'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 5}
-----------
--------
----------
----------
{'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 3}
-----------
--------
----------
----------
{'min_samples_split': 50, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 5}
-----------
--------
----------
----------
max_depth min_samples_split min_samples_leaf max_features \
0 15 50 1 sqrt
1 None 5 2 sqrt
2 10 10 4 sqrt
3 3 100 2 None
4 15 50 4 log2
.. ... ... ... ...
126 5 100 10 log2
127 3 10 2 None
128 5 2 4 sqrt
129 3 5 2 log2
130 5 50 4 None
min_impurity_decrease R2 MAE
0 NaN 0.985844 0.291017
1 NaN 0.984648 0.289155
2 NaN 0.968032 0.528280
3 NaN 0.905205 0.926902
4 NaN 0.985877 0.305633
.. ... ... ...
126 NaN 0.905308 1.016112
127 NaN 0.905205 0.926902
128 NaN 0.985791 0.383561
129 NaN 0.193860 2.536882
130 NaN 0.977076 0.388789
[131 rows x 7 columns]
In [3]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [11]:
print(results_df)
results_df.to_csv('/content/dt_hyp_tuning_results.csv')
max_depth min_samples_split min_samples_leaf max_features \ 0 15 50 1 sqrt 1 None 5 2 sqrt 2 10 10 4 sqrt 3 3 100 2 None 4 15 50 4 log2 .. ... ... ... ... 66 5 100 10 log2 67 3 10 2 None 68 5 2 4 sqrt 69 3 5 2 log2 70 5 50 4 None min_impurity_decrease R2 MAE 0 NaN 0.985844 0.291017 1 NaN 0.984648 0.289155 2 NaN 0.968032 0.528280 3 NaN 0.905205 0.926902 4 NaN 0.985877 0.305633 .. ... ... ... 66 NaN 0.905308 1.016112 67 NaN 0.905205 0.926902 68 NaN 0.985791 0.383561 69 NaN 0.193860 2.536882 70 NaN 0.977076 0.388789 [71 rows x 7 columns]